%matplotlib inline
import time
import os, os.path
import random
import cv2
import glob
import keras
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
import sys
import pandas as pd
import numpy as np
import pickle
from PIL import Image
import shutil
def create_classwise_distribution(dataset_filename):
    """Scan a dataset directory (one sub-directory per class) and write a
    per-file listing to csv_files/class_wise_distribution.csv.

    Each row records the file path, its class name, and how many samples
    that class contains. Returns the DataFrame so callers can reuse it
    without re-reading the CSV (backward-compatible: original returned None).
    """
    class_names = os.listdir(dataset_filename)
    data = list()
    for class_name in class_names:
        file_names = os.listdir(os.path.join(dataset_filename, class_name))
        for file in file_names:
            data.append({
                "FileName": os.path.join(dataset_filename, class_name, file),
                "ClassName": class_name,
                "Number_of_Samples": len(file_names)
            })
    data = pd.DataFrame(data)
    # Ensure the output directory exists before writing.
    csv_dir = os.path.join(os.getcwd(), "csv_files")
    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)
    data.to_csv(os.path.join(csv_dir, "class_wise_distribution.csv"), index=False)
    return data
# Build the class-wise distribution CSV for the raw dataset, then load it back
# together with the per-class summary table.
# data_path = os.path.join(os.getcwd(),"data")
data_path = "D:/Content_Classification/Data"
create_classwise_distribution(data_path)
class_names = os.listdir(data_path)
csv_dir = os.path.join(os.getcwd(), "csv_files")
data = pd.read_csv(os.path.join(csv_dir, "class_wise_distribution.csv"))
data.head()
print(data.shape[0])
print("The number of total files ", data.shape[0])
# Drop the umbrella "Data" entry from the per-class summary and persist it.
data_classwise = pd.read_csv(os.path.join(csv_dir, "data_classwise.csv"))
keep_rows = data_classwise['class_name'] != "Data"
data_classwise = data_classwise[keep_rows]
data_classwise.to_csv(os.path.join(csv_dir, "data_classwise.csv"), index=False)
data_classwise
data.shape
# Classes with fewer than 50 samples are too small for this iteration;
# remove them from both the file listing and the class summary.
under_threshold = data_classwise.iloc[:, 1] < 50
deficient_class_list = data_classwise.iloc[:, 0][under_threshold].tolist()
print(deficient_class_list)
for class_name in deficient_class_list:
    data = data[data['ClassName'] != class_name]
    data_classwise = data_classwise[data_classwise['class_name'] != class_name]
print("The updated number of data files left", data.shape[0])
# Persist and reload the truncated file list so the index is reset.
data.to_csv(os.path.join(os.getcwd(), "csv_files", "filepath_truncated.csv"), index=False)
data = pd.read_csv(os.path.join(os.getcwd(), "csv_files", "filepath_truncated.csv"))
print("The number of classes to be considered for first iteration", data_classwise.shape[0])
data_classwise
Now we create a function that loads all images in a directory for a given array of codes in one array and creates the corresponding label array for them.
Loaded images are resized to 224 x 224 before storing them in our array since this is the size preferred by VGG19 which we will be using later.
# Function returns arrays of images (resized to 224 x 224), their class labels, and file paths for every row of the given DataFrame
from tqdm import tqdm_notebook as tqdm
# Running count of files that failed to load; incremented inside load_images.
count = 0
def load_images(data):
    """Read, resize (224x224) and BGR->RGB convert every image listed in *data*.

    Parameters:
        data: DataFrame with "FileName" and "ClassName" columns.

    Returns:
        (images, labels, filename): three parallel lists. Rows whose image
        could not be read are skipped and tallied in the module-level ``count``.
    """
    # BUG FIX: the original did ``count += 1`` without declaring the variable
    # global, so the first unreadable image raised UnboundLocalError instead
    # of being counted.
    global count
    images = []
    labels = []
    filename = []
    for i in tqdm(range(data.shape[0])):
        try:
            class_name = data.loc[i, "ClassName"]
            file_path = data.loc[i, "FileName"]
            image = cv2.imread(file_path)
            image = cv2.resize(image, (224, 224))
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            images.append(image)
            labels.append(class_name)
            filename.append(file_path)
        except Exception as e:
            print(str(e))
            print("Raising an exception")
            count += 1
    return images, labels, filename
# Load every usable image; ``count`` reports the number of failed reads.
images,labels,filenames = load_images(data)
print(count)
Because the previous step is very time-consuming, we save the images, labels and filenames to pickle files so we can avoid re-running the same loading iterations again and again.
# Cache the loaded data on disk so the expensive loading step can be skipped
# on later runs, then immediately read it back.
pickle_dir = os.path.join(os.getcwd(), "pickle_files")
if not os.path.exists(pickle_dir):
    os.makedirs(pickle_dir)
with open(os.path.join(pickle_dir, "images_list.pkl"), "wb") as fp:  # Pickling
    pickle.dump(images, fp)
with open(os.path.join(pickle_dir, "labels_list.pkl"), "wb") as fp:
    pickle.dump(labels, fp)
with open(os.path.join(pickle_dir, "filenames_list.pkl"), "wb") as fp:
    pickle.dump(filenames, fp)
with open(os.path.join(pickle_dir, "images_list.pkl"), "rb") as fp:  # Unpickling
    images = pickle.load(fp)
with open(os.path.join(pickle_dir, "labels_list.pkl"), "rb") as fp:
    labels = pickle.load(fp)
with open(os.path.join(pickle_dir, "filenames_list.pkl"), "rb") as fp:
    filenames = pickle.load(fp)
def show_random_images(images, labels, number_of_images_to_show=2):
    """Display randomly chosen images for every distinct label.

    Parameters:
        images, labels: parallel sequences of images and their class labels.
        number_of_images_to_show: images sampled (with replacement) per label.
    """
    for labels_iter in list(set(labels)):
        indicies = [i for i, label in enumerate(labels) if label == labels_iter]
        random_indicies = [random.choice(indicies) for i in range(number_of_images_to_show)]
        # BUG FIX: squeeze=False keeps the axes a 2-D array, so
        # number_of_images_to_show=1 no longer crashes — by default
        # plt.subplots squeezes a single Axes to a scalar, which is not
        # subscriptable.
        figure, axis = plt.subplots(1, number_of_images_to_show, squeeze=False)
        print("{} random images for code {}".format(number_of_images_to_show, labels_iter))
        for image in range(number_of_images_to_show):
            axis[0, image].imshow(images[random_indicies[image]])
        plt.show()
# Visual sanity check: two random samples per class.
show_random_images(images, labels)
We now convert the images and labels to NumPy arrays to make processing them easier. We then normalise the images before passing them on to VGG19.
def normalise_images(images, labels):
    """Return (images, labels) as NumPy arrays with pixels scaled to [0, 1]."""
    scaled = np.array(images, dtype=np.float32) / 255
    return scaled, np.array(labels)
images, labels = normalise_images(images, labels)
# The whole dataset is used as the clustering "training" set (no split).
X_train, y_train,filenames = images,labels,filenames
# Load the models with ImageNet weights
# vgg16_model = keras.applications.vgg16.VGG16(include_top=False, weights="imagenet", input_shape=(224,224,3))
# include_top=False drops the classifier head: VGG19 acts as a feature extractor.
vgg19_model = keras.applications.vgg19.VGG19(include_top=False, weights="imagenet", input_shape=(224,224,3))
# resnet50_model = keras.applications.resnet50.ResNet50(include_top=False, weights="imagenet", input_shape=(224,224,3))
def covnet_transform(covnet_model, raw_images):
    """Run the images through a convnet and flatten the activations.

    Returns an array of shape (n_images, n_features).
    """
    features = covnet_model.predict(raw_images)
    return features.reshape(raw_images.shape[0], -1)
# vgg16_output = covnet_transform(vgg16_model, X_train)
# print("VGG16 flattened output has {} features".format(vgg16_output.shape[1]))
# Extract one flat VGG19 feature vector per image (input for clustering).
vgg19_output = covnet_transform(vgg19_model, X_train)
print("VGG19 flattened output has {} features".format(vgg19_output.shape[1]))
# resnet50_output = covnet_transform(resnet50_model, X_train)
# print("ResNet50 flattened output has {} features".format(resnet50_output.shape[1]))
The above cell shows us the number of features each covnet gives to a single image. When we compare these to the original size of the image 224 x 224 x 3 = 150,528 pixels/features, we can see that this is a large reduction in what the clustering algorithms will have to work with.
We need dimensionality reduction because KMeans can produce results with these large feature dimensions, but GMM cannot.
# Function that creates a PCA instance, fits it to the data and returns the instance
def create_fit_PCA(data, n_components=None):
    """Fit a PCA model on *data* (fixed random_state for reproducibility) and return it."""
    # sklearn's fit() returns the estimator itself, so fitting can be chained.
    return PCA(n_components=n_components, random_state=728).fit(data)
# Create PCA instances for each covnet output
# vgg16_pca = create_fit_PCA(vgg16_output)
# Only the VGG19 branch is active; the others are kept for reference.
vgg19_pca = create_fit_PCA(vgg19_output)
# resnet50_pca = create_fit_PCA(resnet50_output)
# Function to plot the cumulative explained variance of PCA components
# This will help us decide how many components we should reduce our features to
def pca_cumsum_plot(pca, title_name):
    """Plot the cumulative explained variance of a fitted PCA instance."""
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    plt.plot(cumulative)
    plt.title(title_name)
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance')
    plt.show()
# Plot the cumulative explained variance for each covnet
# pca_cumsum_plot(vgg16_pca,title_name = "VGG16")
# Only the VGG19 plot is generated in this run.
pca_cumsum_plot(vgg19_pca,title_name = "VGG19")
# pca_cumsum_plot(resnet50_pca,title_name = "RESNET50")
Looking at the graphs above, we can see that PCA can explain almost all the variance in as many dimensions as there are samples.
It is also interesting to note the difference in shape between the VGG graphs and the ResNet one. This is probably due to the fact that ResNet only had 2048 dimensions to start with, while VGGs had 25,088
# PCA transformations of covnet outputs
# vgg16_output_pca = vgg16_pca.transform(vgg16_output)
# Project the VGG19 features into the fitted PCA space for clustering below.
vgg19_output_pca = vgg19_pca.transform(vgg19_output)
# resnet50_output_pca = resnet50_pca.transform(resnet50_output)
def create_train_kmeans(data, number_of_clusters=data_classwise.shape[0]):
    """Fit a KMeans model on *data* and report how long training took.

    The default cluster count (one per remaining class) is captured from
    ``data_classwise`` at the moment this function is defined.
    """
    # n_jobs=-1 uses all available CPU cores — a big win on multi-core
    # machines, especially as the data size grows.
    # NOTE(review): KMeans' n_jobs parameter was removed in scikit-learn 1.0;
    # confirm the pinned scikit-learn version supports it.
    model = KMeans(n_clusters=number_of_clusters, n_jobs=-1, random_state=728)
    started_at = time.time()
    model.fit(data)
    elapsed = time.time() - started_at
    print("Training took {} seconds".format(elapsed))
    return model
# def create_train_gmm(data, number_of_clusters=len(os.listdir(data_path))):
# g = GaussianMixture(n_components=number_of_clusters, covariance_type="full", random_state=728)
# start=time.time()
# g.fit(data)
# end=time.time()
# print("Training took {} seconds".format(end-start))
# return g
# Train KMeans on the PCA-reduced and on the raw VGG19 features, then
# predict a cluster id for every training image.
print("KMeans (PCA): \n")
print("\nVGG19")
K_vgg19_pca = create_train_kmeans(vgg19_output_pca)
print("KMeans: \n")
print("\nVGG19:")
K_vgg19 = create_train_kmeans(vgg19_output)
k_vgg19_pred_pca = K_vgg19_pca.predict(vgg19_output_pca)
k_vgg19_pred = K_vgg19.predict(vgg19_output)
Remember that the clustering algorithm does not detect which images are cats and which are dogs; it only groups images that look alike together and assigns them a number arbitrarily.
We now need to count how many of each label are in each cluster. This way we can take a look and, if sufficient separation has happened, quickly see which cluster is which label. So let's write a function that does that.
def cluster_label_count(clusters, labels):
    """Cross-tabulate cluster assignments against true labels.

    Returns a DataFrame with one column per cluster and one row per label;
    each cell holds the number of samples of that label in that cluster.
    """
    # Initialise every (cluster, label) cell to zero so absent pairs show 0.
    counts = {
        cluster: {label: 0 for label in set(labels)}
        for cluster in set(clusters)
    }
    # Tally each sample's (cluster, label) co-occurrence.
    for cluster, label in zip(clusters, labels):
        counts[cluster][label] += 1
    return pd.DataFrame(counts)
# VGG19 KMeans
# Contingency tables: rows are class labels, columns are cluster ids.
vgg19_cluster_count = cluster_label_count(k_vgg19_pred, y_train)
vgg19_cluster_count_pca = cluster_label_count(k_vgg19_pred_pca, y_train)
# print("KMeans VGG19: ")
# vgg19_cluster_count.to_csv('cluster"s_result.csv',index = False)
vgg19_cluster_count
# np.sum(vgg19_cluster_count.iloc[0,:])
# clusters_result = pd.read_csv('clusters_result.csv')
# clusters_result
print("KMeans VGG19 (PCA): ")
vgg19_cluster_count_pca
Relative purity of a cluster is defined as:
relative purity of cluster 1 w.r.t. class_a = (number of samples of class_a in cluster 1 / total number of samples in cluster 1) * 100,
which implies that a value of 100% indicates that the cluster does not contain samples from any other class.
For the first iteration, we set the benchmark for a cluster to be identified as pure at a relative purity percentage of at least 90.
# Convert raw counts into relative purity percentages:
# purity(class i, cluster j) = count[i, j] / total samples in cluster j * 100.
data_relative_purity = vgg19_cluster_count.copy()
# BUG FIX: take the per-cluster totals from the untouched count table BEFORE
# overwriting any cells. The original recomputed the column sum inside the
# loop, so every row after the first was divided by a sum that already
# contained percentage values instead of raw counts.
cluster_totals = vgg19_cluster_count.sum(axis=0)
# Iterate columns with shape[1] so this also works for non-square tables.
for i in range(vgg19_cluster_count.shape[0]):
    for j in range(vgg19_cluster_count.shape[1]):
        ratio = vgg19_cluster_count.iloc[i, j] / cluster_totals.iloc[j]
        data_relative_purity.iloc[i, j] = np.round(ratio, 2) * 100
data_relative_purity.to_csv(os.path.join(os.getcwd(), "csv_files", "relative_purity.csv"), index=True)
data_relative_purity
data_relative_purity.shape
# data_classwise = pd.read_csv(os.path.join(os.getcwd(),"csv_files","data_classwise.csv"))
# data_classwise.index = data_classwise['class_name'].tolist()
# data_classwise.iloc[data_classwise['class_name'].tolist(),1]
# i = 0
# data_relative_purity.index.values[i]
# Collect every (class, cluster) pair whose relative purity exceeds 90%.
data_pure_clusters = list()
for i in range(data_relative_purity.shape[0]):
    # BUG FIX: iterate columns with shape[1]; the original used shape[0],
    # which only works while the purity table happens to be square.
    for j in range(data_relative_purity.shape[1]):
        if data_relative_purity.iloc[i, j] > 90:
            # Share of class i's total samples that landed in cluster j.
            classPercentage = np.round(vgg19_cluster_count.iloc[i, j] / np.sum(vgg19_cluster_count.iloc[i, :]), 2) * 100
            data_pure_clusters.append({
                "ClassName": data_relative_purity.index.values[i],
                "ClusterNumber": j,
                "PurityPercentage": data_relative_purity.iloc[i, j],
                "ClassPercentage": classPercentage
            })
data_pure_clusters = pd.DataFrame(data_pure_clusters)
data_pure_clusters.to_csv(os.path.join(os.getcwd(), "csv_files", "data_pure_clusters.csv"), index=False)
data_pure_classes = pd.read_csv(os.path.join(os.getcwd(), "csv_files", "data_pure_clusters.csv"))
data_pure_classes
# List every cluster that contains at least one sample of the target class.
vgg19_cluster_count
pure_clusters_list = list()
class_name = "PAN"
clusters_name_list = list()
for row in range(vgg19_cluster_count.shape[0]):
    if vgg19_cluster_count.index[row] != class_name:
        continue
    for col in range(vgg19_cluster_count.shape[1]):
        if vgg19_cluster_count.iloc[row, col] > 0:
            clusters_name_list.append(col)
print(clusters_name_list)
pure_clusters_list = clusters_name_list.copy()
def ClusterIndicesNumpy(clustNum, labels_array):  # numpy
    """Return the positions (1-D index array) where labels_array equals clustNum."""
    matches = labels_array == clustNum
    return np.where(matches)[0]
# For every cluster containing the target class, gather the indices of all
# samples that KMeans assigned to that cluster.
print(np.unique(K_vgg19.labels_))
pure_files_list = list()
for pure_clusters in pure_clusters_list:
    pure_files_list.append(ClusterIndicesNumpy(pure_clusters, K_vgg19.labels_))
# BUG FIX: the per-cluster index arrays have different lengths; NumPy >= 1.24
# raises ValueError when implicitly building a ragged array, so dtype=object
# must be given explicitly.
pure_files_list = np.array(pure_files_list, dtype=object)
print(pure_files_list.shape[0])
print(pure_files_list[0].shape[0])
pure_files_list.shape[0]
# For each candidate cluster, keep only the sample indices whose true class
# matches the target class.
main_cluster = list()
class_name = "PAN"
for i in range(pure_files_list.shape[0]):
    per_cluster = list()
    for j in pure_files_list[i]:
        # Column 1 of ``data`` is ClassName.
        if data.iloc[j, 1] == class_name:
            per_cluster.append(j)
    main_cluster.append(per_cluster)
print(main_cluster)
# BUG FIX: dtype=object is required for ragged (different-length) rows on
# NumPy >= 1.24.
main_cluster = np.array(main_cluster, dtype=object)
print(main_cluster.shape)
# BUG FIX: print every cluster's size instead of hard-coding indices 0-2,
# which raised IndexError whenever fewer than three clusters were found.
for cluster_members in main_cluster:
    print(len(cluster_members))
# Rebuild the clusters/ directory from scratch and copy each cluster's files
# into its own pan<N> sub-folder.
clusters_root = os.path.join(os.getcwd(), "clusters")
if os.path.exists(clusters_root):
    shutil.rmtree(clusters_root, ignore_errors=False, onerror=None)
for row in range(main_cluster.shape[0]):
    target_dir = os.path.join(clusters_root, "pan" + str(row))
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    for column in range(len(main_cluster[row])):
        # Column 0 of ``data`` is the source FileName.
        shutil.copy(data.iloc[main_cluster[row][column], 0], target_dir)
from PIL import Image
# Spot-check: open the first file of the first pure cluster.
Image.open(data.iloc[main_cluster[0][0],0])